import numpy as np
import pandas as pd
import seaborn as sns
import matplotlib.pyplot as plt
from sklearn.preprocessing import LabelEncoder, MinMaxScaler
from sklearn.model_selection import train_test_split, GridSearchCV
from sklearn.linear_model import LogisticRegression
from sklearn.metrics import classification_report, confusion_matrix, accuracy_score, roc_auc_score

# Column labels handed to pd.read_csv (names=...) for both input files,
# listed in the order the fields appear in each record.
columns = [
    "duration",
    "protocoltype",
    "service",
    "flag",
    "srcbytes",
    "dstbytes",
    "land",
    "wrongfragment",
    "urgent",
    "hot",
    "numfailedlogins",
    "loggedin",
    "numcompromised",
    "rootshell",
    "suattempted",
    "numroot",
    "numfilecreations",
    "numshells",
    "numaccessfiles",
    "numoutboundcmds",
    "ishostlogin",
    "isguestlogin",
    "count",
    "srvcount",
    "serrorrate",
    "srvserrorrate",
    "rerrorrate",
    "srvrerrorrate",
    "samesrvrate",
    "diffsrvrate",
    "srvdiffhostrate",
    "dsthostcount",
    "dsthostsrvcount",
    "dsthostsamesrvrate",
    "dsthostdiffsrvrate",
    "dsthostsamesrcportrate",
    "dsthostsrvdiffhostrate",
    "dsthostserrorrate",
    "dsthostsrvserrorrate",
    "dsthostrerrorrate",
    "dsthostsrvrerrorrate",
    "attack",
    "lastflag",
]

# Loading data
# Both files are read with the same settings; the explicit `names` list
# supplies the column labels. Train is read first, then test.
data, data_test = (
    pd.read_csv(csv_path, sep=",", names=columns)
    for csv_path in ("train.txt", "test.txt")
)

# Data Preprocessing
# Discard the features this analysis treats as irrelevant, then remove every
# row that still contains a missing value.
data = data.drop(
    columns=['land', 'urgent', 'numfailedlogins', 'numoutboundcmds']
).dropna(axis=0)

# Integer-encode the categorical columns. One encoder instance is refit for
# each column, so it only remembers the mapping of the column fitted last.
label_encoder = LabelEncoder()
for categorical_column in ('protocoltype', 'service', 'flag'):
    data[categorical_column] = label_encoder.fit_transform(data[categorical_column])

# Reduce the target to two classes: anything that is not "normal" becomes
# "attack". LabelEncoder numbers classes in sorted order, so the resulting
# codes are attack -> 0 and normal -> 1.
data['attack'] = np.where(data['attack'].eq("normal"), "normal", "attack")
data['attack'] = label_encoder.fit_transform(data['attack'])

# Data Visualization
# Draw the pairwise correlation matrix of all (now numeric) columns as a
# heatmap on a single large figure.
correlation_matrix = data.corr()
fig, ax = plt.subplots(figsize=(20, 15))
sns.heatmap(correlation_matrix, ax=ax)
plt.show()

# Feature Scaling
# The split happens BEFORE the scaler is fitted. Fitting MinMaxScaler on the
# whole dataset would let the held-out rows' min/max influence the training
# features (data leakage) and make the test metrics optimistic.
# NOTE(review): 'lastflag' is kept as a predictor here -- confirm it is a
# legitimate input feature and not label-derived metadata.
features = data.drop("attack", axis=1)
y = data['attack']

# Splitting data into training and testing sets
# Same test_size / random_state as before, so the row assignment is unchanged.
X_train_raw, X_test_raw, y_train, y_test = train_test_split(
    features, y, test_size=0.3, random_state=42
)

# Learn the per-feature range from the training rows only, then apply that
# same transformation to the held-out rows. Test values outside the training
# range may therefore fall slightly outside [0, 1]; that is expected.
scaler = MinMaxScaler()
X_train = scaler.fit_transform(X_train_raw)
X_test = scaler.transform(X_test_raw)

# Full feature matrix in original row order, scaled with the training-set
# statistics; kept so that any later code reading `X` still gets a scaled array.
X = scaler.transform(features)

# Model Training: Logistic Regression
# Baseline model with scikit-learn's default hyperparameters.
log_reg = LogisticRegression()
log_reg.fit(X_train, y_train)

# Predictions
# Hard class labels feed the accuracy / report / confusion matrix.
predictions = log_reg.predict(X_test)
# ROC-AUC is a ranking metric and needs continuous scores: use the predicted
# probability of the class with the larger label (column 1 == classes_[1]).
# Passing hard 0/1 predictions would reduce the curve to a single point.
positive_scores = log_reg.predict_proba(X_test)[:, 1]

# Model Evaluation
print("Accuracy:", accuracy_score(y_test, predictions))
print("ROC-AUC Score:", roc_auc_score(y_test, positive_scores))
print("Classification Report:\n", classification_report(y_test, predictions))
print("Confusion Matrix:\n", confusion_matrix(y_test, predictions))

# Hyperparameter Tuning with GridSearchCV
# The grid searches both L1 and L2 penalties. The estimator's default solver
# (lbfgs) does not support L1, so those candidates would fail to fit and be
# scored as NaN; pinning the solver to liblinear, which supports both
# penalties, makes every candidate in the grid actually get evaluated.
param_grid = {
    'C': [0.01, 0.1, 1, 10, 100],
    'penalty': ['l1', 'l2'],
    'solver': ['liblinear'],
}
grid_search = GridSearchCV(log_reg, param_grid, cv=5)
grid_search.fit(X_train, y_train)

# Best Parameters and Improved Predictions
best_params = grid_search.best_params_
print("Best Parameters:", best_params)
# After fitting, the search object delegates to the refit best estimator.
improved_predictions = grid_search.predict(X_test)
# Continuous scores for ROC-AUC: probability of the class with the larger
# label (column 1), rather than hard 0/1 predictions.
improved_scores = grid_search.predict_proba(X_test)[:, 1]

# Evaluation of Improved Model
print("Improved Accuracy:", accuracy_score(y_test, improved_predictions))
print("Improved ROC-AUC Score:", roc_auc_score(y_test, improved_scores))
print("Improved Classification Report:\n", classification_report(y_test, improved_predictions))
print("Improved Confusion Matrix:\n", confusion_matrix(y_test, improved_predictions))
